This assignment has been completed by Rahul Paul Gopireddy (#801307911) and Aryan Reddy Baddam (#801311891)¶

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('./data/netflix_titles.csv')

1. Select: Allow users to select data points interactively.¶

Notes¶

  • Type: Scatter plot.
  • Justification: Users can click on data points to select them.
In [2]:
# Function to extract numeric value from duration
def extract_duration(duration):
    if isinstance(duration, float):
        return duration
    else:
        try:
            return int(duration.split()[0])
        except ValueError:
            return None
    
# Convert duration to numeric
df['duration_numeric'] = df['duration'].apply(extract_duration)

# Separate TV shows and movies
current_year = pd.Timestamp.now().year
last_20_years_df = df[df['release_year'] >= current_year - 20]

# Separate TV shows and movies
tv_shows_df = last_20_years_df[last_20_years_df['type'] == 'TV Show']
movies_df = last_20_years_df[last_20_years_df['type'] == 'Movie']

# Plotting TV shows
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.scatter(tv_shows_df['release_year'], tv_shows_df['duration_numeric'])
plt.xlabel('Release Year')
plt.ylabel('Duration (Seasons)')
plt.title('TV Shows (Last 20 Years)')


# Plotting movies
plt.subplot(1, 2, 2)
plt.scatter(movies_df['release_year'], movies_df['duration_numeric'])
plt.xlabel('Release Year')
plt.ylabel('Duration (Minutes)')
plt.title('Movies (Last 20 Years)')



plt.tight_layout()
plt.show()
In [ ]:
 

2. Explore: Enable zooming and panning to explore data in detail.¶

Notes¶

  • Type: Any plot that supports zooming and panning (e.g., scatter plot, line plot).
  • Justification: Users can zoom in/out and pan to explore specific regions of the plot.
In [3]:
import plotly.express as px

# fig = px.scatter(df, x='release_year', y='country')
# fig.update_layout(title='Explore Data', xaxis_title='Release Year', yaxis_title='Country release' )
# fig.show()

fig = px.scatter(df, x='release_year', y='country', title='Explore Data',
                 labels={'release_year': 'Release Year', 'country': 'Country release'}, hover_data={'title': False},
                 hover_name='title')

# Update layout
fig.update_layout(title='Explore Data', xaxis_title='Release Year', yaxis_title='Country release')

# Show plot
fig.show()

# Hover on the plot to get the options to zoom

3. Reconfigure: Allow users to change the Director.¶

¶

  • Type: Dropdown menu to select different directors.
  • Justification:to see the time line of movies released and number of movies directed by selected directed.
In [4]:
from ipywidgets import interact, Dropdown

# Create a list of unique directors
directors = df['director'].unique()

# Set the default director
default_director = "Spike Lee"

def plot_movies_by_director(selected_director=default_director):
    if selected_director is None:
        return

    movies = df[df['director'] == selected_director][['title', 'release_year']]
    
    # Plotting the count of movies by director
    plt.figure(figsize=(16, 8))
    ax1 = plt.subplot(2, 1, 1)
    movies_count = len(movies)
    ax1.bar(selected_director, movies_count, color='skyblue')
    ax1.set_ylabel('Number of Movies')
    ax1.set_title(f"Number of Movies Directed by {selected_director}")

    # Plotting the release years of movies by director
    ax2 = plt.subplot(2, 1, 2)
    ax2.plot(movies['title'], movies['release_year'], marker='o', color='orange', linestyle='-')
    ax2.set_ylabel('Release Year')
    ax2.set_xlabel('Movie Title')
    ax2.set_title(f"Release Years of Movies Directed by {selected_director}")
    ax2.grid(True)
    ax2.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.show()

# Create interactive dropdown menu
interact(plot_movies_by_director, selected_director=Dropdown(options=directors, value=default_director))
interactive(children=(Dropdown(description='selected_director', index=81, options=('Kirsten Johnson', nan, 'Ju…
Out[4]:
<function __main__.plot_movies_by_director(selected_director='Spike Lee')>

4. Encode: Use different visual properties (color, size, shape) to encode additional dimensions.¶

¶

  • test world cloud in windows
  • Type: Scatter plot with color/size encoding.
  • Justification: Color/size can represent an additional categorical/numerical variable.
In [5]:
from wordcloud import WordCloud
# Filter out NaN values and split directors by comm

# Path to a TrueType font file
font_path = "./data/Lato-Regular.ttf"

# Directors data
directors = df['director'].dropna().str.split(', ')

# Calculate the count of movies directed by each director
director_counts = directors.explode().value_counts().to_dict()

# Generate the word cloud with count in the label
text = ' '.join([f"{director} ({director_counts[director]})" for director_list in directors for director in director_list])
wordcloud = WordCloud(width=800, height=400, background_color='white', font_path=font_path).generate(text)

# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
In [ ]:
 

5. Abstract: Use aggregation or summarization to show a higher-level view of the data.¶

¶

  • Type: Pie chart.
  • Justification: Shows the distribution of categories in a compact form.
In [6]:
# Calculate country counts for TV shows and movies
tv_shows = df[df['type'] == 'TV Show']
movies = df[df['type'] == 'Movie']

tv_shows_country_counts = tv_shows['country'].value_counts(normalize=True) * 100
movies_country_counts = movies['country'].value_counts(normalize=True) * 100

# Function to plot pie chart with custom label
def plot_pie_chart(country_counts, title):
    threshold = 5
    other_countries = country_counts[country_counts < threshold]
    major_countries = country_counts[country_counts >= threshold]
    major_countries['Other'] = other_countries.sum()

    plt.figure()
    major_countries.plot.pie(autopct=lambda p: '{:.1f}%'.format(p) if p >= threshold else '', label='', colors=plt.cm.tab20.colors)
    plt.title(title)
    plt.ylabel('')
    plt.show()

# Plotting pie charts
plot_pie_chart(tv_shows_country_counts, 'TV Shows by Country')
plot_pie_chart(movies_country_counts, 'Movies by Country')

6. Elaborate: Provide additional details on demand (e.g., tooltips).¶

¶

  • Type: Interactive plot with tooltips.
  • Justification: Users can hover over data points to see additional information.
In [7]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Scatter(x=df['release_year'], y=df['duration'], mode='markers',
                                 text=df['title'], marker=dict(size=8))])
fig.update_layout(title='Elaborate with Tooltips', xaxis_title='Release Year', yaxis_title='Duration')
fig.show()

7. Filter: Allow users to filter data based on specific criteria.¶

¶

  • Type: Bar graph
  • Justification: Users can see the directors who have directed the most , right now threshold of filter is >= 10 for movies and >= 2 for tv shows
In [8]:
# Split directors and count the occurrences
tv_directors = directors[df['type'] == 'TV Show']
movie_directors = directors[df['type'] == 'Movie']

# Calculate the count of movies directed by each director for TV shows and movies
tv_director_counts = tv_directors.explode().value_counts()
movie_director_counts = movie_directors.explode().value_counts()


# Filter directors with at least 10 occurrences
filtered_tv_directors = tv_director_counts[tv_director_counts >= 2]
filtered_movie_directors = movie_director_counts[movie_director_counts >= 10]

# Plotting histograms
fig, axs = plt.subplots(1, 2, figsize=(16, 6))

axs[0].bar(filtered_tv_directors.index, filtered_tv_directors.values, color='skyblue')
axs[0].set_xlabel('Director')
axs[0].set_ylabel('Frequency')
axs[0].set_title('Frequency of Directors in TV Shows (>= 2 occurrences)')
axs[0].tick_params(axis='x', rotation=45)

axs[1].bar(filtered_movie_directors.index, filtered_movie_directors.values, color='salmon')
axs[1].set_xlabel('Director')
axs[1].set_ylabel('Frequency')
axs[1].set_title('Frequency of Directors in Movies (>= 10 occurrences)')
axs[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

8. Connect: Show relationships between different data points.¶

¶

  • Type: Network graph.
  • Justification: To analyse the tv shows and movies and their rating , with release year to check the popularity of geners of the decade.

Reference Used: https://medium.com/codex/how-to-automatically-generate-data-structure-for-sankey-diagrams-6082e332139f

In [9]:
import plotly.graph_objects as go
import pandas as pd
df = pd.read_csv('./data/netflix_titles.csv')

def map_year_to_decade(year):
    return str(year // 10 * 10) + 's'

# Apply the function to create the new column
df['Decade'] = df['release_year'].apply(map_year_to_decade)



def data_snakey(data, path, value_col):
    sankey_data = {
        'label': [],
        'source': [],
        'target': [],
        'value': []
    }
    counter = 0
    while (counter < len(path) - 1):
        for parent in data[path[counter]].unique():
            sankey_data['label'].append(str(parent))  # Convert to string
            for sub in data[data[path[counter]] == parent][path[counter + 1]].unique():
                sankey_data['source'].append(sankey_data['label'].index(str(parent)))  # Convert to string
                sankey_data['label'].append(str(sub))  # Convert to string
                sankey_data['target'].append(sankey_data['label'].index(str(sub)))  # Convert to string
                sankey_data['value'].append(str(data[data[path[counter + 1]] == sub][value_col].sum()))  # Convert to string

        counter += 1
    return sankey_data

con_data = data_snakey(df, [ 'type', 'rating', 'Decade'], 'release_year')


df

fig = go.Figure(data=[go.Sankey(
node = dict(
  pad = 15,
  thickness = 20,
  line = dict(color = "black", width = 0.5),
  label = con_data['label'],

),
link = dict(
  source = con_data['source'],
  target = con_data['target'],
  value = con_data['value']
))
])
fig.update_layout(height=700,margin={'t':0,'b':0})
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: